This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

library(tidyplots)

Attaching package: ‘tidyplots’

The following object is masked from ‘package:ggpubr’:

    gene_expression

>>>> Figure 1: plots of TE experiment data - read counts and read depths

Read-in data

#ATCC genomes updated version
#October 2024

#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data

####read counts/normalised read counts####

#metadata

metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

Format data

Make plots

plot just the two lefthand panels from the original figure

plot viral read counts (non normalised)

Final plot

——————–

Figure S1

Read depths

depth_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )
Rows: 95 Columns: 12── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): virus, Sample_id, Background, mapper, type
dbl (7): Viral load, mean_depth, median_depth, min, max, LQ, UQ
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Make plots


read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10() +
  ylab("Log(mean read depth)")

#ggsave("figures/compare_spike_ins_atcc/mean_depth.pdf",width=8,height=6)

Final plot

ggarrange(
  read_count,
  read_count_norm1,
  read_count_norm2,
  read_depths,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)
Error: object 'read_count' not found

——————–

Figure S2

Reaed-in data / Make plots

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

####compare library capture pool####

cols3 <- c("#228833", "#AA3377")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

metadata3 <- 
  metadata |>
  select(Sample.ID, Pool.for.sequencing) |>
  rename(Sample_id = Sample.ID) |>
  rename(Pool = Pool.for.sequencing)

counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")

Make plots


pool_readcounts <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(matched))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

pool_counts_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

pool_readcounts_norm <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(norm_counts1))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads)") +  xlab("Viral load (copies / ml)")

pool_norm1_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads)") +   xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

pool_readcounts_norm2 <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(norm_counts2))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads/genome length)") + xlab("Viral load (copies / ml)")


pool_norm2_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads/genome length)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
  xlab("Viral load (copies / ml)")


#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")

pool_readdepths <- 
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(mean_depth))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") + xlab("Viral load (copies / ml)")

pool_depths_sum <- 
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(mean_depth),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") +
  xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

Final plot

ggarrange(
  pool_counts_sum,
  pool_norm1_sum,
  pool_norm2_sum,
  pool_depths_sum,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/pool_compare_viruses_sum.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS2.pdf",width=10,height=7)

——————–

Figure S3


##plots of read counts and viral read counts
#November 2024

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import read count files

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

cols2 <- c("#BB5566", "#004488")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

reads_metadata_dedup <-
  left_join(counts_dedup_bt, metadata2, by = "Sample_id")

reads_viral_dedup <-
  left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")

reads_metadata_nodedup <-
  left_join(counts_nodedup_bt, metadata2, by = "Sample_id")

reads_viral_nodedup <-
  left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")

reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)


reads_plot <- 
  reads_viral_all |>
  group_by(Background, Sample_id, Viral.load, type) |>
  summarise(
    total_reads = (QC_reads * 2),
    viral_reads = total_virus_reads,
    ATCC_reads = sum(matched),
    prop_ATCC = ATCC_reads / total_reads,
    prop_viral = total_virus_reads / total_reads,
    diff = viral_reads - ATCC_reads
  ) |>
  unique()

Make plots

Final plot

——————–

Figure S4

Make plots

####read counts/depths split by background####

#change labels in facet plots

facet_names_bg <- c(
  "dedup_TE" = "Deduplicated",
  "nodedup_TE" = "Non-Deduplicated",
  "p2" = "P1",
  "p8" = "P2"
)

#break down by Background x virus

background_counts_reads <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels =
      c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
      )
  ) +
  scale_x_log10(labels = scales::label_log()) +
  ylab("Log(Viral Reads)")

backgrounds_counts_reads_norm <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_bw() +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log()) +
  ylab("Log(viral reads/cleaned reads)")

backgrounds_counts_reads_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(labels = scales::label_log()) +
  ylab("Log(viral reads/cleaned reads/genome length)")

background_read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log()) +
  ylab("Log(mean read depth)")

Final plot

——————–

Figure S5 - linear model

Read-in data

Format data


depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  ,
  .default = "RNA"
  ))

####linear model for spike in viruses - mean read depth####

depths_reads_sub <- depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  mutate(log_depth = log(mean_depth)) |>
  mutate(log_viral_load = log10(Viral.load))

#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)

summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)

model

lm_tidy <- 
  do.call(rbind.data.frame, tidy(list_models))
Error: No tidy method recognized for this list.

——————–

>>> Figure 2: plots of TE experiment data - genome coverage & per site coverage

Read-in data


#plots of TE experiment data - genome coverage & per site coverage
#also separated by viruses
#ATCC genomes updated version
#October 2024

#use bowtie2 data

####genome coverage####

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_TE/")

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv.zip", exdir = "data_TE/")

dedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )


nodedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

file.remove("data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")

file.remove( "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv")

#add length column from read depth file to per site file

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- 
  metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files

#deduplicated and non-deduplicated

counts_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

counts_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

Format data

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- 
  counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

lengths <- 
  counts_reads_norm |>
  select(virus, length) |>
  distinct()

#calculate genome coverage

persite_coverage_dedup <- 
  dedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_nodedup <- 
  nodedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_both <-
  rbind(persite_coverage_dedup, 
        persite_coverage_nodedup)

coverage_labels <- c("0" = "Control",
                     "100" = "10^{2}",
                     "1000" = "10^{3}",
                     "10000" = "10^{5}",
                     "dedup_TE" = "Deduplicated",
                     "nodedup_TE"= "Non-Deduplicated")

coverage_labels2 <- c("0" = "Control",
                      "100" = "10^{2}",
                      "1000" = "10^{3}",
                      "10000" = "10^{5}")

Make plots

Final plot

——————–

——————–

Figure S6


persite_both <- rbind(dedup_per_site, nodedup_per_site)

persite_reads <- left_join(persite_both, metadata2, by = "Sample_id")

persite_norm <- persite_reads |>
  mutate(norm_cov = coverage / QC_reads)

coverage_labels3 <-c("100" = "1e+02",
                    "1000" = "1e+03",
                    "100000" = "1e+05",
                    "p2" = "P1",
                    "p8"= "P2")

persite_norm$`Viral load` <- 
  factor(persite_norm$`Viral load`, 
         labels = c("0",
                    "10^{2}", 
                    "10^{3}",
                    "10^{5}"))


persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_respiratory_syncytial_virus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Log(Coverage)") +
  ggtitle("Human RSV (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

Final plot

#ggsave("figures/manuscript_figures_pdf/FigureS5.pdf",width=10,height=8)

——————–

Figure S7


persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Zika_virus") |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Zika virus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

#ggsave("figures/manuscript_figures_pdf/FigureS6.pdf",width=10,height=8)

——————–

Figure S8

persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_adenovirus_40") |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Human Adenovirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

#ggsave("figures/manuscript_figures_pdf/FigureS7.pdf",width=20,height=18)

——————–

Figure S9

persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_betaherpesvirus") |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_polygon() +
  facet_wrap(Viral.load ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Human Betaherpesvirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

#ggsave("figures/compare_spike_ins_atcc/HBHV_coverage_test.pdf",width=20,height=18)

——————–

Figure S10

#have to do these differently due to segmentation

FLU_ME1 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Influenza B virus (Background 1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    labels = c("1e+02", "1e+03", "1e+05"))

FLU_ME2 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p8") |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Influenza B virus (Background 2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    labels = c("1e+02", "1e+03", "1e+05"))

ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)

#ggsave("figures/manuscript_figures_pdf/FigureS9.pdf",width=20,height=18)

——————–

Figure S11


REO_ME1 <- persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))

REO_ME2 <- persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p8") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))

ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)

#ggsave("figures/manuscript_figures_pdf/FigureS10.pdf",width=20,height=18)

——————–

>>>> Figure 3: all viruses - target and off target

Read-in data


# read in metadata

metadata <-
  read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <-
  metadata |>
  select(
    Sample.ID,
    Background.sample,
    Viral.load,
    Number.of.read.pairs..quality.adaptor.trimmed.
  ) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

#import contingency tables

contingency_dedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_dedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contingency_nodedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_nodedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contaminants <-
  c("Betacoronavirus", "Alphainfluenzavirus", "Gammaretrovirus")

Format data


# pivot longer  
dedup_long <-
  contingency_dedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

no_dedup_long <- contingency_nodedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

metadata_dedup <- left_join(dedup_long, metadata2, by = "Sample_id")

dedup_viral_reads <-
  left_join(metadata_dedup, viral_reads_dedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

metadata_no_dedup <- left_join(no_dedup_long, metadata2, by = "Sample_id")

nodedup_viral_reads <-
  left_join(metadata_no_dedup, viral_reads_nodedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

Make plots

Dedup – viral reads, prop all reads, and prop viral reads

Final plot

—-nodedup results —-

——————–

---
title: "target_offtarget"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
##all viruses - target and off target

library(tidyverse)
library(ggpubr)
library(tidyplots)
library(ggthemes)

```


## >>>> Figure 1: plots of TE experiment data - read counts and read depths

### Read-in data
```{r}
#ATCC genomes updated version
#October 2024

#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data

####read counts/normalised read counts####

#metadata

metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

```


### Format data
```{r}
#change labels in facet plots

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")
```

### Make plots
```{r}

#plot viral read counts (non normalised)

read_count <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log(Viral Reads)")

#ggsave("figures/compare_spike_ins_atcc/read_counts.pdf",width=8,height=6)

#normalise by raw read count

read_count_norm1 <- counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log(viral reads/cleaned reads)")

#normalise by raw read count and genome length

read_count_norm2 <- counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log(viral reads/cleaned reads/genome length)")


```

### plot just the two lefthand panels from the original figure
```{r, warning=FALSE, echo=FALSE, message=FALSE, fig.height=10, fig.width=8}

ggarrange(read_count,
          read_count_norm2,
          nrow = 2,
          common.legend = TRUE)

#ggsave("figures/compare_spike_ins_atcc/log_read_count_depth_2panels.png",width=10,height=7)


```

### plot viral read counts (non normalised)
```{r}

##deduplicated only

read_count_dedup <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(
    x = Viral.load,
    y = log(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log(Viral Reads)") +
  xlab("Viral load")

#normalise by raw read count and genome length

read_count_norm2_dedup <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(
    x = Viral.load,
    y = log(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  ylab("Log(viral reads/cleaned reads/genome length)") +
  xlab("Viral load")

```

### ***Final plot***
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.height=6, fig.height=8}
#plot just the two lefthand panels from the original figure

ggarrange(
  read_count_dedup,
  read_count_norm2_dedup,
  nrow = 2,
  common.legend = TRUE,
  align = "hv"
)

# ggsave("figures/compare_spike_ins_atcc/log_read_count_depth_2panels_dedup.png",
#        width=10,
#        height=7)

ggsave("figures/manuscript_figure_2025/PDF/Figure_1.pdf",
       width=6,
       height=8)

ggsave("figures/manuscript_figure_2025/PNG/Figure_1.png",
       width=6,
       height=8)

```


### --------------------

## Figure S1

### Read depths
```{r}
####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

depth_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  depths_bt_all |>
  left_join(metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  )) |>
  rename(Viral.load = `Viral load`)

```

### Make plots
```{r}

read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10() +
  ylab("Log(mean read depth)")

#ggsave("figures/compare_spike_ins_atcc/mean_depth.pdf",width=8,height=6)

```

### ***Final plot***
```{r}

ggarrange(
  read_count,
  read_count_norm1,
  read_count_norm2,
  read_depths,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/log_read_count_depth.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS1.pdf",width=10,height=7)

#apparently normalised read count using method 2 should not actually be the same

```


### --------------------

## Figure S2

### Reaed-in data / Make plots
```{r}
#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

####compare library capture pool####

cols3 <- c("#228833", "#AA3377")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

metadata3 <- 
  metadata |>
  select(Sample.ID, Pool.for.sequencing) |>
  rename(Sample_id = Sample.ID) |>
  rename(Pool = Pool.for.sequencing)

counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")

```

### Make plots
```{r}

pool_readcounts <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(matched))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

pool_counts_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (Viral Reads)") + xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

pool_readcounts_norm <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(norm_counts1))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads)") +  xlab("Viral load (copies / ml)")

pool_norm1_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads)") +   xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

pool_readcounts_norm2 <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(norm_counts2))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads/genome length)") + xlab("Viral load (copies / ml)")


pool_norm2_sum <- 
  counts_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(matched),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log(viral reads/cleaned reads/genome length)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) + 
  xlab("Viral load (copies / ml)")


#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- 
  left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")

pool_readdepths <- 
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = virus, y = log(mean_depth))) +
  geom_boxplot() +
  facet_grid(Pool ~ type) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") + xlab("Viral load (copies / ml)")

pool_depths_sum <- 
  depths_pool |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = as.character(Viral.load),
    y = log(mean_depth),
    colour = Pool
  )) +
  geom_boxplot() +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    # axis.title.x = element_blank(),
    # axis.text.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  ylab("Log (mean read depth)") +
  xlab("Viral load (copies / ml)") +
  scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))

```

### Final plot
```{r}
ggarrange(
  pool_counts_sum,
  pool_norm1_sum,
  pool_norm2_sum,
  pool_depths_sum,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/pool_compare_viruses_sum.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS2.pdf",width=10,height=7)

```

### --------------------

## Figure S3
```{r}

##plots of read counts and viral read counts
#November 2024

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import read count files

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

cols2 <- c("#BB5566", "#004488")

facet_names <- c("dedup_TE" = "Deduplicated",
                 "nodedup_TE" = "Non-Deduplicated")

reads_metadata_dedup <-
  left_join(counts_dedup_bt, metadata2, by = "Sample_id")

reads_viral_dedup <-
  left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")

reads_metadata_nodedup <-
  left_join(counts_nodedup_bt, metadata2, by = "Sample_id")

reads_viral_nodedup <-
  left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")

reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)


reads_plot <- 
  reads_viral_all |>
  group_by(Background, Sample_id, Viral.load, type) |>
  summarise(
    total_reads = (QC_reads * 2),
    viral_reads = total_virus_reads,
    ATCC_reads = sum(matched),
    prop_ATCC = ATCC_reads / total_reads,
    prop_viral = total_virus_reads / total_reads,
    diff = viral_reads - ATCC_reads
  ) |>
  unique()

```

### Make plots
```{r, echo=FALSE, warning=FALSE, message=FALSE}

total_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(total_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10(labels = scales::label_log()) + xlab("Viral load (copies / ml)") +
  ggtitle("Total Reads") +
  ylab("Log(Total Reads)") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

viral_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(viral_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10(labels = scales::label_log()) + xlab("Viral load (copies / ml)") +
  ggtitle("Total Viral Reads") +
  ylab("Log(Total Viral Reads)") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

ATCC_reads <- 
  reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(ATCC_reads),
    colour = Background
  )) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10(labels = scales::label_log()) + xlab("Viral load (copies / ml)") +
  ggtitle("Spike In Viral Reads") +
  ylab("Log(Spike In Viral Reads)") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

prop_ATCC <- reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = Viral.load, y = prop_ATCC, colour = Background)) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10(labels = scales::label_log()) + xlab("Viral load (copies / ml)") +
  ggtitle("Proportion Spike In Viral") +
  ylab("Spike In Viral Reads/Total Reads") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))

prop_viral <- reads_plot |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = Background)) +
  geom_point() +
  geom_smooth(aes(group = Background), se = FALSE) +
  facet_grid( ~ type, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10(labels = scales::label_log()) + xlab("Viral load (copies / ml)") +
  ggtitle("Proportion Viral") +
  ylab("Viral Reads/Total Reads") +
  scale_color_manual(values = cols2, labels = c("P1", "P2"))


#ggarrange(total_reads,viral_reads,ATCC_reads,prop_viral,prop_ATCC,nrow=2,ncol=3,common.legend = TRUE)

```

### ***Final plot***
```{r}
ggarrange(total_reads,
          viral_reads,
          prop_viral,
          nrow = 3,
          common.legend = TRUE,
          align = "hv")

#ggsave("figures/compare_spike_ins_atcc/backgrounds_reads.png")

#ggsave("figures/manuscript_figures_pdf/FigureS3.pdf")

####compare proportion viral reads per pool with shotgun####

# metadata3 <- metadata |>
#   select(Sample.ID,
#          Number.of.read.pairs..quality.adaptor.trimmed.,
#          Pool.for.sequencing) |>
#   rename(Sample_id = Sample.ID) |>
#   rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
# 
# reads_viral_dedup3 <-
#   full_join(metadata3, viral_reads_dedup, by = "Sample_id")
# 
# reads_viral_nodedup3 <-
#   full_join(metadata3, viral_reads_nodedup, by = "Sample_id")
# 
# reads_viral_dedup3 |>
#   group_by(Pool.for.sequencing) |>
#   summarise(
#     viral_reads = sum(total_virus_reads),
#     total_reads = sum(QC_reads * 2),
#     prop_viral = viral_reads / total_reads
#   )
# 
# polyomics <-
#   read.csv(
#     "data_polyomics/total_virus_mapped_reads_per_sample_dedup.csv"
#   )
# 
# #read data from polyomics_indexes_stefano document
# total_reads <- data.frame(total_reads = c(5942760, 7714275))
# 
# keeps <- c("RNA-Msp-p2", "RNA-Msp-p8")
# 
# polyomics_read_prop <- polyomics |>
#   filter(Sample_id %in% keeps)
# 
# polyomics_read_prop2 <- cbind(polyomics_read_prop, total_reads)
# 
# polyomics_read_prop2 |>
#   summarise(prop_viral = total_virus_reads / total_reads)

```

### --------------------

## Figure S4

### Make plots
```{r}
####read counts/depths split by background####

#change labels in facet plots

facet_names_bg <- c(
  "dedup_TE" = "Deduplicated",
  "nodedup_TE" = "Non-Deduplicated",
  "p2" = "P1",
  "p8" = "P2"
)

#break down by Background x virus

background_counts_reads <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(matched),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels =
      c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
      )
  ) +
  scale_x_log10(labels = scales::label_log()) +
  ylab("Log(Viral Reads)")

backgrounds_counts_reads_norm <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(norm_counts1),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_bw() +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log()) +
  ylab("Log(viral reads/cleaned reads)")

backgrounds_counts_reads_norm2 <- 
  counts_reads_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(norm_counts2),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(labels = scales::label_log()) +
  ylab("Log(viral reads/cleaned reads/genome length)")

background_read_depths <- 
  depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  ggplot(aes(
    x = Viral.load,
    y = log(mean_depth),
    colour = virus
  )) +
  geom_point() +
  geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
  facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
  theme_few() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
    axis.title.y = element_text(size = 10),
    legend.title = element_blank()
  ) +
  scale_color_manual(
    values = cols,
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_x_log10(label = scales::label_log()) +
  ylab("Log(mean read depth)")

```

### ***Final plot*** 
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.height=6, fig.height=8}

ggarrange(
  background_counts_reads,
  backgrounds_counts_reads_norm,
  backgrounds_counts_reads_norm2,
  background_read_depths,
  nrow = 2,
  ncol = 2,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/backgrounds_read_count_depth.png",width=10,height=7)

#ggsave("figures/manuscript_figures_pdf/FigureS4.pdf",width=10,height=7)

```


### --------------------

### Figure S5 - linear model

#### Read-in data
```{r}
#Linear model for TE data - separated by virus
#November 2024

#viral read depth
#use bowtie2 data

library(tidyverse)
library(broom)
library(boot)

####read counts/normalised read counts####

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files
#deduplicated and non-deduplicated

counts_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

####read depths####

#import and combine read depth files

depth_dedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

depth_nodedup_bt <-
  read.table(
    "data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
    sep = "\t",
    header = TRUE
  )

```

#### Format data
```{r}

depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)

depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  ,
  .default = "RNA"
  ))

####linear model for spike in viruses - mean read depth####

depths_reads_sub <- depths_reads |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  mutate(log_depth = log(mean_depth)) |>
  mutate(log_viral_load = log10(Viral.load))

#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)

summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)

```

#### model
```{r}
#all viruses together

lm1 <-
  glm(log_depth ~ log_viral_load * virus,
      data = depths_reads_sub,
      family = "gaussian")

summary(lm1)

glm.diag.plots(lm1)

pred <- predict(lm1, type = "response")

rsq <- function (x, y) {
  cor(x, y) ^ 2
}

rsq(depths_reads_sub$log_depth, pred)

##split by viruses

cols <- c("#4477AA",
          "#66CCEE",
          "#228833",
          "#CCBB44",
          "#EE6677",
          "#AA3377")

facet_names <- c(
  "Human_adenovirus_40" = "Human adenovirus 40",
  "Human_betaherpesvirus" = "Human betaherpesvirus",
  "Human_respiratory_syncytial_virus" = "Human respiratory syncytial virus",
  "Influenza_B_virus" = "Influenza B virus",
  "Mammalian_orthoreovirus3" = "Mammalian orthoreovirus 3",
  "Zika_virus" = "Zika virus"
)

virus <- (unique(depths_reads_sub$virus)) |>
  rep(each = 2) |>
  data.frame() |>
  rename(virus = rep..unique.depths_reads_sub.virus....each...2.)

list_models <- 
  depths_reads_sub |>
  group_split(virus) |>
  map( ~ lm(log_depth ~ log_viral_load, data = .))

lm_tidy <- 
  map(list_models, broom::tidy) |>
  do.call(rbind.data.frame, .) |>
  cbind(virus, .) |>
  rename(virus = ".") |>
  select(virus, term, estimate) |>
  pivot_wider(names_from = "term", values_from = "estimate")

lm_summary <- depths_reads_sub |>
  group_by(virus) |>
  summarise(
    Intercept = lm(log_depth ~ log_viral_load)$coefficients[1],
    Coeff_x1 = lm(log_depth ~ log_viral_load)$coefficients[2],
    R2 = summary(lm(log_depth ~ log_viral_load))$r.squared,
    pvalue = summary(lm(log_depth ~ log_viral_load))$coefficients["log_viral_load", 4]
  )

lm_combined <- left_join(lm_tidy, lm_summary, by = "virus")

ggplot() +
  geom_point(data = depths_reads_sub,
             mapping = aes(log_viral_load, log_depth, color = virus)) +
  geom_abline(data = lm_combined, aes(intercept = `(Intercept)`, slope = log_viral_load)) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(
              x = 4.7,
              y = 3,
              label = round(Coeff_x1, digits = 2)
            )) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(x = 4, y = 3, label = "Slope =")) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(
              x = 4.7,
              y = 2,
              label = round(R2, digits = 2)
            )) +
  geom_text(data = lm_combined,
            colour = "black",
            aes(x = 4, y = 2, label = "R2 =")) +
  facet_wrap( ~ virus, labeller = as_labeller(facet_names)) +
  theme_bw() +
  xlab("Log(Viral Load)") +
  ylab("Log(Mean Read Depth)") +
  scale_color_manual(values = cols) +
  guides(color = FALSE)

#ggsave("figures/compare_spike_ins_atcc/model_readdepth_dedup.png")

#ggsave("figures/manuscript_figures_pdf/FigureS5.pdf")

```

### --------------------

## >>> Figure 2: plots of TE experiment data - genome coverage & per site coverage

### Read-in data
```{r}

#plots of TE experiment data - genome coverage & per site coverage
#also separated by viruses
#ATCC genomes updated version
#October 2024

#use bowtie2 data

####genome coverage####

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_TE/")

unzip(zipfile = "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv.zip", exdir = "data_TE/")

dedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )


nodedup_per_site <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

file.remove("data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")

file.remove( "data_TE/TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv")

#add length column from read depth file to per site file

#metadata

metadata <-
  read.csv(
    "metadata/sampleIDs_TESpikeIn.csv",
    header = TRUE
  )

metadata2 <- 
  metadata |>
  select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import and combine read count files

#deduplicated and non-deduplicated

counts_dedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
  )

counts_nodedup_bt <-
  read_tsv(
    "data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
  )

counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)

counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")

```

### Format data
```{r}
#normalise by both raw read count and genome length - should be the same as mean read depth

counts_reads_norm <- 
  counts_reads |>
  mutate(norm_counts1 = matched / QC_reads) |>
  mutate(norm_counts2 = matched / QC_reads / length) |>
  mutate(norm_counts3 = matched / length) |>
  mutate(genome_structure = case_when((
    virus == "Human_adenovirus_40" |
      virus == "Human_betaherpesvirus"
  ) ~ "DNA",
  .default = "RNA"
  ))

lengths <- 
  counts_reads_norm |>
  select(virus, length) |>
  distinct()

#calculate genome coverage

persite_coverage_dedup <- 
  dedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_nodedup <- 
  nodedup_per_site |>
  rename(Viral.load = `Viral load`) |>
  group_by(Sample_id, virus, Viral.load, Background, type) |>
  filter(coverage > 0) |>
  summarise(genome_coverage = n()) |>
  left_join(lengths) |>
  mutate(percent_coverage = genome_coverage / length)

persite_coverage_both <-
  rbind(persite_coverage_dedup, 
        persite_coverage_nodedup)

coverage_labels <- c("0" = "Control",
                     "100" = "10^{2}",
                     "1000" = "10^{3}",
                     "10000" = "10^{5}",
                     "dedup_TE" = "Deduplicated",
                     "nodedup_TE"= "Non-Deduplicated")

coverage_labels2 <- c("0" = "Control",
                      "100" = "10^{2}",
                      "1000" = "10^{3}",
                      "10000" = "10^{5}")
```

### Make plots
```{r, echo = FALSE, warning=FALSE, message=FALSE, fig.height=4, fig.width=6}

cols2 <- c("#BB5566",
           "#004488")

cols4 <- c("#DDAA33",
           "#BB5566",
           "#004488")


persite_coverage_both$Viral.load <- factor(persite_coverage_both$Viral.load, labels = c("0",
    "10^{2}", 
    "10^{3}", 
    "10^{5}"
    ))


##are the deduplicated and non deduplicated datasets identical??
##if they are the same can maybe just show one
#show deduplicated

persite_coverage_both |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  ggplot(aes(x = virus, y = percent_coverage, colour = Background)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position = position_dodge(width = .75)) +
  facet_grid(~as.character(Viral.load), labeller = label_parsed) +
  theme_few() +
  ylab("Genome coverage") +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_color_manual(values = cols2, labels = c("P1", "P2")) +
  scale_x_discrete(
    labels = c(
      "Human adenovirus 40",
      "Human betaherpesvirus",
      "Human respiratory syncytial virus",
      "Influenza B virus",
      "Mammalian orthoreovirus 3",
      "Zika virus"
    )
  ) +
  scale_y_continuous(limits = c(0, 1))

```

### ***Final plot***
```{r, echo = FALSE, warning=FALSE, message=FALSE}
#ggsave("figures/compare_spike_ins_atcc/genome_cov_deduponly.png",width=15,height=6)

ggsave(
  "figures/manuscript_figure_2025/PDF/Figure_2.pdf",
  width = 15,
  height = 6
)

ggsave(
  "figures/manuscript_figure_2025/PNG/Figure_2.png",
  width = 15,
  height = 6
)


```


### --------------------


### --------------------

### Figure S6
```{r}

persite_both <- rbind(dedup_per_site, nodedup_per_site)

persite_reads <- left_join(persite_both, metadata2, by = "Sample_id")

persite_norm <- persite_reads |>
  mutate(norm_cov = coverage / QC_reads)

coverage_labels3 <-c("100" = "1e+02",
                    "1000" = "1e+03",
                    "100000" = "1e+05",
                    "p2" = "P1",
                    "p8"= "P2")

persite_norm$`Viral load` <- 
  factor(persite_norm$`Viral load`, 
         labels = c("0",
                    "10^{2}", 
                    "10^{3}",
                    "10^{5}"))


persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_respiratory_syncytial_virus") |>
  rename(Viral.load = `Viral load`) |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
  ylab("Log(Coverage)") +
  ggtitle("Human RSV (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))
```

#### ***Final plot***
```{r}
#ggsave("figures/manuscript_figures_pdf/FigureS5.pdf",width=10,height=8)
```


### --------------------

### Figure S7
```{r}

persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Zika_virus") |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Zika virus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

#ggsave("figures/manuscript_figures_pdf/FigureS6.pdf",width=10,height=8)

```


### --------------------

### Figure S8
```{r}
persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_adenovirus_40") |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_col() +
  facet_wrap(Viral.load ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Human Adenovirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

#ggsave("figures/manuscript_figures_pdf/FigureS7.pdf",width=20,height=18)
```


### --------------------

### Figure S9
```{r}
persite_norm |>
  filter(Background != "p6") |>
  filter(Background != "control") |>
  filter(type == "dedup_TE") |>
  filter(virus == "Human_betaherpesvirus") |>
  ggplot(aes(x = site, y = coverage, fill = Background)) +
  geom_polygon() +
  facet_wrap(Viral.load ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Human Betaherpesvirus (deduplicated)") +
  guides(fill = guide_legend(title = "Background")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols2, labels = c("P1", "P2"))

#ggsave("figures/compare_spike_ins_atcc/HBHV_coverage_test.pdf",width=20,height=18)
```



### --------------------

### Figure S10
```{r}
#have to do these differently due to segmentation

FLU_ME1 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p2") |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Influenza B virus (Background 1 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    labels = c("1e+02", "1e+03", "1e+05"))

FLU_ME2 <- persite_norm |>
  filter(virus == "Influenza_B_virus") |>
  filter(type == "dedup_TE") |>
  filter(Background == "p8") |>
  ggplot(aes(
    x = site,
    y = coverage,
    fill = as.character(Viral.load)
  )) +
  geom_col() +
  facet_grid(seg ~ Sample_id) +
  ylab("Log(Coverage)") +
  ggtitle("Influenza B virus (Background 2 deduplicated)") +
  guides(fill = guide_legend(title = "Viral load")) +
  theme_bw() +
  scale_y_log10() +
  scale_fill_manual(values = cols4,
                    labels = c("1e+02", "1e+03", "1e+05"))

ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)

#ggsave("figures/manuscript_figures_pdf/FigureS9.pdf",width=20,height=18)
```


### --------------------

### Figure S11
```{r}

REO_ME1 <- persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))

REO_ME2 <- persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p8") |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Log(Coverage)") +
ggtitle("Mammalian orthoreovirus 3 (Background 2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_bw() +
scale_y_log10() +
scale_fill_manual(values = cols4,
labels = c("1e+02", "1e+03", "1e+05"))

ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)

#ggsave("figures/manuscript_figures_pdf/FigureS10.pdf",width=20,height=18)


```




### --------------------

## >>>> Figure 3: all viruses - target and off target

### Read-in data
```{r}

# read in metadata

metadata <-
  read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)

metadata2 <-
  metadata |>
  select(
    Sample.ID,
    Background.sample,
    Viral.load,
    Number.of.read.pairs..quality.adaptor.trimmed.
  ) |>
  rename(Sample_id = Sample.ID) |>
  rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)

#import viral reads mapped (to calculate proportions)

viral_reads_dedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
    header = TRUE
  )

viral_reads_nodedup <-
  read.csv(
    "data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
    header = TRUE
  )

#import contingency tables

contingency_dedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_dedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contingency_nodedup <-
  read.csv(
    "data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_nodedup_atcc_ref_20241106.csv",
    header = TRUE
  )

contaminants <-
  c("Betacoronavirus", "Alphainfluenzavirus", "Gammaretrovirus")

```

### Format data
```{r}

# pivot longer  
dedup_long <-
  contingency_dedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

no_dedup_long <- contingency_nodedup |>
  filter(!genus %in% contaminants) |>
  filter(genus != "NA") |>
  pivot_longer(cols = A:P,
               names_to = "Sample_id",
               values_to = "count") |>
  mutate(
    target = case_when(
      genus == "Cyclovirus" ~ "off_target",
      genus == "Cardiovirus" ~ "off_target",
      genus == "Kobuvirus" ~ "off_target",
      .default = "on_target"
    )
  ) |>
  mutate(genome_structure = case_when((genus == "Mastadenovirus" |
                                         genus == "Cytomegalovirus") ~ "DNA",
                                      .default = "RNA"
  ))

metadata_dedup <- left_join(dedup_long, metadata2, by = "Sample_id")

dedup_viral_reads <-
  left_join(metadata_dedup, viral_reads_dedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

metadata_no_dedup <- left_join(no_dedup_long, metadata2, by = "Sample_id")

nodedup_viral_reads <-
  left_join(metadata_no_dedup, viral_reads_nodedup, by = "Sample_id") |>
  mutate(total_reads = QC_reads * 2) |>
  mutate(prop_total = count / total_reads) |>
  mutate(prop_viral = count / total_virus_reads)

```

### Make plots
```{r, warning=FALSE, message=FALSE, echo=FALSE}

facet_names <- c(
  "Msp_p2" = "P1",
  "Msp_p8" = "P2",
  "off_target" = "Off target viruses",
  "on_target" = "On target viruses"
)

cols <-
  c(
    "#CCBB44",
    "#332288",
    "#EE7733",
    "#66CCEE",
    "#882255",
    "#4477AA",
    "#AA3377",
    "#228833",
    "#EE6677"
  )


dedup_reads <- 
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(
    x = Viral.load,
    y = log(count),
    colour = genus
  )) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  ylab("Log (Viral Reads)") + 
  xlab("Log (Viral load)") + 
  scale_color_manual(values = cols) +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

dedup_prop_viral <-
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  scale_y_log10() +
  ylab("Proportion of viral reads") +
  xlab("Log (Viral load)") + 
  scale_color_manual(values = cols) +
    theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

dedup_prop_all <-
  dedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_total, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() + 
  scale_x_log10() +
  scale_y_log10(limits=c(5e-8,1)) +
  ylab("Proportion of total reads") +
  xlab("Log (Viral load)") + 
  scale_color_manual(values = cols) +
    theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  )

```

### Dedup -- viral reads, prop all reads, and prop viral reads
```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.width=8, fig.height=12}

ggarrange(
  dedup_reads,
  dedup_prop_all,
  dedup_prop_viral,
  nrow = 3,
  common.legend = TRUE,
  align = "hv"
)

```

### ***Final plot***

```{r, warning=FALSE, message=FALSE, echo=FALSE, fig.width=7, fig.height=8}

#ggsave("figures/compare_spike_ins_atcc/target_offtarget_dedup.png",width=8,height=10)

##just plot viral reads and prop viral

ggarrange(
  dedup_reads,
  dedup_prop_viral,
  nrow = 2,
  common.legend = TRUE,
  align = "hv"
)

ggsave("figures/compare_spike_ins_atcc/target_offtarget_dedup_2025-01-01.png", width = 10, height = 8)

ggsave("figures/manuscript_figures_pdf/Figure_3_2025-01-01.pdf", width = 10, height = 8)

```

### ----nodedup results ----
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=12, fig.width=6}
##extras - no dedup

nodedup_reads <-
  nodedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(
    x = Viral.load,
    y = log(count),
    colour = genus
  )) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  ylab("Log(Viral Reads)") +
  scale_color_manual(values = cols)

nodedup_prop_viral <- 
  nodedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_viral, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_y_log10() +
  ylab("Proportion of viral reads") +
  scale_color_manual(values = cols)

nodedup_prop_all <- nodedup_viral_reads |>
  filter(Background.sample != "Neg_control") |>
  filter(Background.sample != "Msp_p6") |>
  ggplot(aes(x = Viral.load, y = prop_total, colour = genus)) +
  geom_point() +
  geom_smooth(aes(group = genus, linetype = genome_structure), se = FALSE) +
  facet_grid(target ~ Background.sample, labeller = as_labeller(facet_names)) +
  theme_few() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(angle = 0, hjust = 1),
    axis.title.y = element_text(size = 12),
    legend.title = element_blank(),
    legend.position = "top", 
    panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
    panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
  ) +
  scale_x_log10() +
  scale_y_log10() +
  ylab("Proportion of total reads") +
  scale_color_manual(values = cols)

ggarrange(
  nodedup_reads,
  nodedup_prop_all,
  nodedup_prop_viral,
  nrow = 3,
  common.legend = TRUE,
  align = "hv"
)

#ggsave("figures/compare_spike_ins_atcc/target_offtarget_nodedup.pdf",width=8,height=10)

```

### --------------------

